#!/usr/local/bin/python
# -*- encoding: utf-8 -*-
# unicode2ascii.py
"""Convert many unicode characters to ascii characters that are like them.

I want to collate names, with the property that a last name starting with
O-umlaut will be in with the last names starting with O.  Horrors!

So I want many Latin-1 characters to have their umlauts, etc., stripped.
Some of it can be done automatically but some needs to be done by hand, as
far as I can tell.
"""
__version__='1.0.1'
__author__='Jim Hefferon: ftpmaint at tug.ctan.org'
__date__='2008-July-15'
__notes__="""As sources, used effbot's web site, and
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/251871
and
  man uni2ascii
"""
import os, os.path, sys, re

import unicodedata

# These are characters that are not handled automatically by NFKD and that
# have a name starting with "LATIN".  Some of these I found on the interwebs,
# but some I did by eye.  Corrections or additions appreciated.
EXTRA_LATIN_NAMES = {
    # Maps a single LATIN character to its ascii look-alike.  The replacement
    # keeps the case of the original: capitals and small capitals map to
    # upper case, small letters to lower case.  Entries that are commented
    # out have no obvious ascii look-alike and are left untranslated.
    #
    # First are ones I got off the interweb
    u"\N{LATIN CAPITAL LETTER O WITH STROKE}": u"O",
    u"\N{LATIN SMALL LETTER A WITH GRAVE}": u"a",
    u"\N{LATIN SMALL LETTER A WITH ACUTE}": u"a",
    u"\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}": u"a",
    u"\N{LATIN SMALL LETTER A WITH TILDE}": u"a",
    u"\N{LATIN SMALL LETTER A WITH DIAERESIS}": u"a",
    u"\N{LATIN SMALL LETTER A WITH RING ABOVE}": u"a",
    u"\N{LATIN SMALL LETTER C WITH CEDILLA}": u"c",
    u"\N{LATIN SMALL LETTER E WITH GRAVE}": u"e",
    u"\N{LATIN SMALL LETTER E WITH ACUTE}": u"e",
    u"\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}": u"e",
    u"\N{LATIN SMALL LETTER E WITH DIAERESIS}": u"e",
    u"\N{LATIN SMALL LETTER I WITH GRAVE}": u"i",
    u"\N{LATIN SMALL LETTER I WITH ACUTE}": u"i",
    u"\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}": u"i",
    u"\N{LATIN SMALL LETTER I WITH DIAERESIS}": u"i",
    u"\N{LATIN SMALL LETTER N WITH TILDE}": u"n",
    u"\N{LATIN SMALL LETTER O WITH GRAVE}": u"o",
    u"\N{LATIN SMALL LETTER O WITH ACUTE}": u"o",
    u"\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}": u"o",
    u"\N{LATIN SMALL LETTER O WITH TILDE}": u"o",
    u"\N{LATIN SMALL LETTER O WITH DIAERESIS}": u"o",
    u"\N{LATIN SMALL LETTER O WITH STROKE}": u"o",
    u"\N{LATIN SMALL LETTER U WITH GRAVE}": u"u",
    u"\N{LATIN SMALL LETTER U WITH ACUTE}": u"u",
    u"\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}": u"u",
    u"\N{LATIN SMALL LETTER U WITH DIAERESIS}": u"u",
    u"\N{LATIN SMALL LETTER Y WITH ACUTE}": u"y",
    u"\N{LATIN SMALL LETTER Y WITH DIAERESIS}": u"y",
    u"\N{LATIN SMALL LETTER A WITH MACRON}": u"a",
    u"\N{LATIN SMALL LETTER A WITH BREVE}": u"a",
    u"\N{LATIN SMALL LETTER C WITH ACUTE}": u"c",
    u"\N{LATIN SMALL LETTER C WITH CIRCUMFLEX}": u"c",
    u"\N{LATIN SMALL LETTER E WITH MACRON}": u"e",
    u"\N{LATIN SMALL LETTER E WITH BREVE}": u"e",
    u"\N{LATIN SMALL LETTER G WITH CIRCUMFLEX}": u"g",
    u"\N{LATIN SMALL LETTER G WITH BREVE}": u"g",
    u"\N{LATIN SMALL LETTER G WITH CEDILLA}": u"g",
    u"\N{LATIN SMALL LETTER H WITH CIRCUMFLEX}": u"h",
    u"\N{LATIN SMALL LETTER I WITH TILDE}": u"i",
    u"\N{LATIN SMALL LETTER I WITH MACRON}": u"i",
    u"\N{LATIN SMALL LETTER I WITH BREVE}": u"i",
    u"\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}": u"j",
    u"\N{LATIN SMALL LETTER K WITH CEDILLA}": u"k",
    u"\N{LATIN SMALL LETTER L WITH ACUTE}": u"l",
    u"\N{LATIN SMALL LETTER L WITH CEDILLA}": u"l",
    u"\N{LATIN CAPITAL LETTER L WITH STROKE}": u"L",
    u"\N{LATIN SMALL LETTER L WITH STROKE}": u"l",
    u"\N{LATIN SMALL LETTER N WITH ACUTE}": u"n",
    u"\N{LATIN SMALL LETTER N WITH CEDILLA}": u"n",
    u"\N{LATIN SMALL LETTER O WITH MACRON}": u"o",
    u"\N{LATIN SMALL LETTER O WITH BREVE}": u"o",
    u"\N{LATIN SMALL LETTER R WITH ACUTE}": u"r",
    u"\N{LATIN SMALL LETTER R WITH CEDILLA}": u"r",
    u"\N{LATIN SMALL LETTER S WITH ACUTE}": u"s",
    u"\N{LATIN SMALL LETTER S WITH CIRCUMFLEX}": u"s",
    u"\N{LATIN SMALL LETTER S WITH CEDILLA}": u"s",
    u"\N{LATIN SMALL LETTER T WITH CEDILLA}": u"t",
    u"\N{LATIN SMALL LETTER U WITH TILDE}": u"u",
    u"\N{LATIN SMALL LETTER U WITH MACRON}": u"u",
    u"\N{LATIN SMALL LETTER U WITH BREVE}": u"u",
    u"\N{LATIN SMALL LETTER U WITH RING ABOVE}": u"u",
    u"\N{LATIN SMALL LETTER W WITH CIRCUMFLEX}": u"w",
    u"\N{LATIN SMALL LETTER Y WITH CIRCUMFLEX}": u"y",
    u"\N{LATIN SMALL LETTER Z WITH ACUTE}": u"z",
    u"\N{LATIN SMALL LETTER W WITH GRAVE}": u"w",
    u"\N{LATIN SMALL LETTER W WITH ACUTE}": u"w",
    u"\N{LATIN SMALL LETTER W WITH DIAERESIS}": u"w",
    u"\N{LATIN SMALL LETTER Y WITH GRAVE}": u"y",
    # Below are the ones that failed automated conversion
    u'\N{LATIN CAPITAL LETTER AE}': u'AE',
    u'\N{LATIN CAPITAL LETTER ETH}': u'D',
    u'\N{LATIN CAPITAL LETTER THORN}': u'TH',
    u'\N{LATIN SMALL LETTER SHARP S}': u'ss',
    u'\N{LATIN SMALL LETTER AE}': u'ae',
    u'\N{LATIN SMALL LETTER ETH}': u'd',
    u'\N{LATIN SMALL LETTER THORN}': u'th',
    u'\N{LATIN CAPITAL LETTER D WITH STROKE}': u'D',
    u'\N{LATIN SMALL LETTER D WITH STROKE}': u'd',
    u'\N{LATIN CAPITAL LETTER H WITH STROKE}': u'H',
    u'\N{LATIN SMALL LETTER H WITH STROKE}': u'h',
    u'\N{LATIN SMALL LETTER DOTLESS I}': u'i',
    u'\N{LATIN SMALL LETTER KRA}': u'q',
    u'\N{LATIN CAPITAL LETTER ENG}': u'N',
    u'\N{LATIN SMALL LETTER ENG}': u'n',
    u'\N{LATIN CAPITAL LIGATURE OE}': u'OE',
    u'\N{LATIN SMALL LIGATURE OE}': u'oe',
    u'\N{LATIN CAPITAL LETTER T WITH STROKE}': u'T',
    u'\N{LATIN SMALL LETTER T WITH STROKE}': u't',
    u'\N{LATIN SMALL LETTER B WITH STROKE}': u'b',
    u'\N{LATIN CAPITAL LETTER B WITH HOOK}': u'B',
    u'\N{LATIN CAPITAL LETTER B WITH TOPBAR}': u'B',
    u'\N{LATIN SMALL LETTER B WITH TOPBAR}': u'b',
    # u'\N{LATIN CAPITAL LETTER TONE SIX}': u'',  # ?B
    # u'\N{LATIN SMALL LETTER TONE SIX}': u'',  # ?b
    u'\N{LATIN CAPITAL LETTER OPEN O}': u'O',
    u'\N{LATIN CAPITAL LETTER C WITH HOOK}': u'C',
    u'\N{LATIN SMALL LETTER C WITH HOOK}': u'c',
    u'\N{LATIN CAPITAL LETTER AFRICAN D}': u'D',
    u'\N{LATIN CAPITAL LETTER D WITH HOOK}': u'D',
    u'\N{LATIN CAPITAL LETTER D WITH TOPBAR}': u'D',
    u'\N{LATIN SMALL LETTER D WITH TOPBAR}': u'd',
    # u'\N{LATIN SMALL LETTER TURNED DELTA}': u'',
    u'\N{LATIN CAPITAL LETTER REVERSED E}': u'E',
    # u'\N{LATIN CAPITAL LETTER SCHWA}': u'',
    u'\N{LATIN CAPITAL LETTER OPEN E}': u'E',
    u'\N{LATIN CAPITAL LETTER F WITH HOOK}': u'F',
    u'\N{LATIN SMALL LETTER F WITH HOOK}': u'f',
    u'\N{LATIN CAPITAL LETTER G WITH HOOK}': u'G',
    # u'\N{LATIN CAPITAL LETTER GAMMA}': u'',
    u'\N{LATIN SMALL LETTER HV}': u'hv',
    u'\N{LATIN CAPITAL LETTER IOTA}': u'I',
    u'\N{LATIN CAPITAL LETTER I WITH STROKE}': u'I',
    u'\N{LATIN CAPITAL LETTER K WITH HOOK}': u'K',
    u'\N{LATIN SMALL LETTER K WITH HOOK}': u'k',
    u'\N{LATIN SMALL LETTER L WITH BAR}': u'l',
    # u'\N{LATIN SMALL LETTER LAMBDA WITH STROKE}': u'',
    # u'\N{LATIN CAPITAL LETTER TURNED M}': u'',
    u'\N{LATIN CAPITAL LETTER N WITH LEFT HOOK}': u'N',
    u'\N{LATIN SMALL LETTER N WITH LONG RIGHT LEG}': u'n',
    u'\N{LATIN CAPITAL LETTER O WITH MIDDLE TILDE}': u'O',
    u'\N{LATIN CAPITAL LETTER OI}': u'OI',
    u'\N{LATIN SMALL LETTER OI}': u'oi',
    u'\N{LATIN CAPITAL LETTER P WITH HOOK}': u'P',
    u'\N{LATIN SMALL LETTER P WITH HOOK}': u'p',
    # u'\N{LATIN LETTER YR}': u'',
    # u'\N{LATIN CAPITAL LETTER TONE TWO}': u'',
    # u'\N{LATIN SMALL LETTER TONE TWO}': u'',
    u'\N{LATIN CAPITAL LETTER ESH}': u'SH',
    # u'\N{LATIN LETTER REVERSED ESH LOOP}': u'',
    u'\N{LATIN SMALL LETTER T WITH PALATAL HOOK}': u't',
    u'\N{LATIN CAPITAL LETTER T WITH HOOK}': u'T',
    u'\N{LATIN SMALL LETTER T WITH HOOK}': u't',
    u'\N{LATIN CAPITAL LETTER T WITH RETROFLEX HOOK}': u'T',
    # u'\N{LATIN CAPITAL LETTER UPSILON}': u'',
    u'\N{LATIN CAPITAL LETTER V WITH HOOK}': u'V',
    u'\N{LATIN CAPITAL LETTER Y WITH HOOK}': u'Y',
    u'\N{LATIN SMALL LETTER Y WITH HOOK}': u'y',
    u'\N{LATIN CAPITAL LETTER Z WITH STROKE}': u'Z',
    u'\N{LATIN SMALL LETTER Z WITH STROKE}': u'z',
    u'\N{LATIN CAPITAL LETTER EZH}': u'S',
    # u'\N{LATIN CAPITAL LETTER EZH REVERSED}': u'',
    # u'\N{LATIN SMALL LETTER EZH REVERSED}': u'',
    u'\N{LATIN SMALL LETTER EZH WITH TAIL}': u's',
    # u'\N{LATIN LETTER TWO WITH STROKE}': u'',
    # u'\N{LATIN CAPITAL LETTER TONE FIVE}': u'',
    # u'\N{LATIN SMALL LETTER TONE FIVE}': u'',
    # u'\N{LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE}': u'',
    u'\N{LATIN LETTER WYNN}': u'w',
    # u'\N{LATIN LETTER DENTAL CLICK}': u'',
    # u'\N{LATIN LETTER LATERAL CLICK}': u'',
    # u'\N{LATIN LETTER ALVEOLAR CLICK}': u'',
    # u'\N{LATIN LETTER RETROFLEX CLICK}': u'',
    # u'\N{LATIN SMALL LETTER TURNED E}': u'',
    u'\N{LATIN CAPITAL LETTER AE WITH MACRON}': u'AE',
    u'\N{LATIN SMALL LETTER AE WITH MACRON}': u'ae',
    u'\N{LATIN CAPITAL LETTER G WITH STROKE}': u'G',
    u'\N{LATIN SMALL LETTER G WITH STROKE}': u'g',
    u'\N{LATIN CAPITAL LETTER EZH WITH CARON}': u'S',
    u'\N{LATIN SMALL LETTER EZH WITH CARON}': u's',
    u'\N{LATIN CAPITAL LETTER HWAIR}': u'HW',
    u'\N{LATIN CAPITAL LETTER WYNN}': u'W',
    u'\N{LATIN CAPITAL LETTER AE WITH ACUTE}': u'AE',
    u'\N{LATIN SMALL LETTER AE WITH ACUTE}': u'ae',
    u'\N{LATIN CAPITAL LETTER O WITH STROKE AND ACUTE}': u'O',
    u'\N{LATIN SMALL LETTER O WITH STROKE AND ACUTE}': u'o',
    u'\N{LATIN CAPITAL LETTER YOGH}': u'J',
    u'\N{LATIN SMALL LETTER YOGH}': u'j',
    u'\N{LATIN CAPITAL LETTER N WITH LONG RIGHT LEG}': u'N',
    u'\N{LATIN SMALL LETTER D WITH CURL}': u'd',
    u'\N{LATIN CAPITAL LETTER OU}': u'OU',
    u'\N{LATIN SMALL LETTER OU}': u'ou',
    u'\N{LATIN CAPITAL LETTER Z WITH HOOK}': u'Z',
    u'\N{LATIN SMALL LETTER Z WITH HOOK}': u'z',
    u'\N{LATIN SMALL LETTER L WITH CURL}': u'l',
    u'\N{LATIN SMALL LETTER N WITH CURL}': u'n',
    u'\N{LATIN SMALL LETTER T WITH CURL}': u't',
    u'\N{LATIN SMALL LETTER DOTLESS J}': u'j',
    u'\N{LATIN SMALL LETTER DB DIGRAPH}': u'db',
    u'\N{LATIN SMALL LETTER QP DIGRAPH}': u'qp',
    u'\N{LATIN CAPITAL LETTER A WITH STROKE}': u'A',
    u'\N{LATIN CAPITAL LETTER C WITH STROKE}': u'C',
    u'\N{LATIN SMALL LETTER C WITH STROKE}': u'c',
    u'\N{LATIN CAPITAL LETTER L WITH BAR}': u'L',
    u'\N{LATIN CAPITAL LETTER T WITH DIAGONAL STROKE}': u'T',
    u'\N{LATIN SMALL LETTER S WITH SWASH TAIL}': u's',
    u'\N{LATIN SMALL LETTER Z WITH SWASH TAIL}': u'z',
    # u'\N{LATIN CAPITAL LETTER GLOTTAL STOP}': u'',
    # u'\N{LATIN SMALL LETTER TURNED A}': u'',
    # u'\N{LATIN SMALL LETTER ALPHA}': u'',
    # u'\N{LATIN SMALL LETTER TURNED ALPHA}': u'',
    u'\N{LATIN SMALL LETTER B WITH HOOK}': u'b',
    u'\N{LATIN SMALL LETTER OPEN O}': u'o',
    u'\N{LATIN SMALL LETTER C WITH CURL}': u'c',
    u'\N{LATIN SMALL LETTER D WITH TAIL}': u'd',
    u'\N{LATIN SMALL LETTER D WITH HOOK}': u'd',
    # u'\N{LATIN SMALL LETTER REVERSED E}': u'',
    # u'\N{LATIN SMALL LETTER SCHWA}': u'',
    # u'\N{LATIN SMALL LETTER SCHWA WITH HOOK}': u'',
    u'\N{LATIN SMALL LETTER OPEN E}': u'e',
    # u'\N{LATIN SMALL LETTER REVERSED OPEN E}': u'',
    # u'\N{LATIN SMALL LETTER REVERSED OPEN E WITH HOOK}': u'',
    # u'\N{LATIN SMALL LETTER CLOSED REVERSED OPEN E}': u'',
    u'\N{LATIN SMALL LETTER DOTLESS J WITH STROKE}': u'j',
    u'\N{LATIN SMALL LETTER G WITH HOOK}': u'g',
    u'\N{LATIN SMALL LETTER SCRIPT G}': u'g',
    u'\N{LATIN LETTER SMALL CAPITAL G}': u'G',
    # u'\N{LATIN SMALL LETTER GAMMA}': u'',
    # u'\N{LATIN SMALL LETTER RAMS HORN}': u'',
    # u'\N{LATIN SMALL LETTER TURNED H}': u'',
    u'\N{LATIN SMALL LETTER H WITH HOOK}': u'h',
    u'\N{LATIN SMALL LETTER HENG WITH HOOK}': u'h',
    u'\N{LATIN SMALL LETTER I WITH STROKE}': u'i',
    # u'\N{LATIN SMALL LETTER IOTA}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL I}': u'I',
    u'\N{LATIN SMALL LETTER L WITH MIDDLE TILDE}': u'l',
    u'\N{LATIN SMALL LETTER L WITH BELT}': u'l',
    u'\N{LATIN SMALL LETTER L WITH RETROFLEX HOOK}': u'l',
    # u'\N{LATIN SMALL LETTER LEZH}': u'',
    # u'\N{LATIN SMALL LETTER TURNED M}': u'',
    # u'\N{LATIN SMALL LETTER TURNED M WITH LONG LEG}': u'',
    u'\N{LATIN SMALL LETTER M WITH HOOK}': u'm',
    u'\N{LATIN SMALL LETTER N WITH LEFT HOOK}': u'n',
    u'\N{LATIN SMALL LETTER N WITH RETROFLEX HOOK}': u'n',
    u'\N{LATIN LETTER SMALL CAPITAL N}': u'N',
    u'\N{LATIN SMALL LETTER BARRED O}': u'o',
    u'\N{LATIN LETTER SMALL CAPITAL OE}': u'OE',
    # u'\N{LATIN SMALL LETTER CLOSED OMEGA}': u'',
    # u'\N{LATIN SMALL LETTER PHI}': u'',
    # u'\N{LATIN SMALL LETTER TURNED R}': u'',
    # u'\N{LATIN SMALL LETTER TURNED R WITH LONG LEG}': u'',
    # u'\N{LATIN SMALL LETTER TURNED R WITH HOOK}': u'',
    u'\N{LATIN SMALL LETTER R WITH LONG LEG}': u'r',
    u'\N{LATIN SMALL LETTER R WITH TAIL}': u'r',
    u'\N{LATIN SMALL LETTER R WITH FISHHOOK}': u'r',
    # u'\N{LATIN SMALL LETTER REVERSED R WITH FISHHOOK}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL R}': u'R',
    # u'\N{LATIN LETTER SMALL CAPITAL INVERTED R}': u'',
    u'\N{LATIN SMALL LETTER S WITH HOOK}': u's',
    u'\N{LATIN SMALL LETTER ESH}': u'sh',
    u'\N{LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK}': u'j',
    # u'\N{LATIN SMALL LETTER SQUAT REVERSED ESH}': u'',
    u'\N{LATIN SMALL LETTER ESH WITH CURL}': u'sh',
    # u'\N{LATIN SMALL LETTER TURNED T}': u'',
    u'\N{LATIN SMALL LETTER T WITH RETROFLEX HOOK}': u't',
    u'\N{LATIN SMALL LETTER U BAR}': u'u',
    # u'\N{LATIN SMALL LETTER UPSILON}': u'',
    u'\N{LATIN SMALL LETTER V WITH HOOK}': u'v',
    # u'\N{LATIN SMALL LETTER TURNED V}': u'',
    # u'\N{LATIN SMALL LETTER TURNED W}': u'',
    # u'\N{LATIN SMALL LETTER TURNED Y}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL Y}': u'Y',
    u'\N{LATIN SMALL LETTER Z WITH RETROFLEX HOOK}': u'z',
    u'\N{LATIN SMALL LETTER Z WITH CURL}': u'z',
    u'\N{LATIN SMALL LETTER EZH}': u's',
    u'\N{LATIN SMALL LETTER EZH WITH CURL}': u's',
    # u'\N{LATIN LETTER GLOTTAL STOP}': u'',
    # u'\N{LATIN LETTER PHARYNGEAL VOICED FRICATIVE}': u'',
    # u'\N{LATIN LETTER INVERTED GLOTTAL STOP}': u'',
    u'\N{LATIN LETTER STRETCHED C}': u'c',
    # u'\N{LATIN LETTER BILABIAL CLICK}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL B}': u'B',
    u'\N{LATIN SMALL LETTER CLOSED OPEN E}': u'e',
    u'\N{LATIN LETTER SMALL CAPITAL G WITH HOOK}': u'G',
    u'\N{LATIN LETTER SMALL CAPITAL H}': u'H',
    u'\N{LATIN SMALL LETTER J WITH CROSSED-TAIL}': u'j',
    # u'\N{LATIN SMALL LETTER TURNED K}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL L}': u'L',
    u'\N{LATIN SMALL LETTER Q WITH HOOK}': u'q',
    # u'\N{LATIN LETTER GLOTTAL STOP WITH STROKE}': u'',
    # u'\N{LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE}': u'',
    # u'\N{LATIN SMALL LETTER DZ DIGRAPH}': u'',
    # u'\N{LATIN SMALL LETTER DEZH DIGRAPH}': u'',
    # u'\N{LATIN SMALL LETTER DZ DIGRAPH WITH CURL}': u'',
    # u'\N{LATIN SMALL LETTER TS DIGRAPH}': u'',
    # u'\N{LATIN SMALL LETTER TESH DIGRAPH}': u'',
    # u'\N{LATIN SMALL LETTER TC DIGRAPH WITH CURL}': u'',
    # u'\N{LATIN SMALL LETTER FENG DIGRAPH}': u'',
    # u'\N{LATIN SMALL LETTER LS DIGRAPH}': u'',
    # u'\N{LATIN SMALL LETTER LZ DIGRAPH}': u'',
    # u'\N{LATIN LETTER BILABIAL PERCUSSIVE}': u'',
    # u'\N{LATIN LETTER BIDENTAL PERCUSSIVE}': u'',
    # u'\N{LATIN SMALL LETTER TURNED H WITH FISHHOOK}': u'',
    # u'\N{LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL A}': u'A',
    u'\N{LATIN LETTER SMALL CAPITAL AE}': u'AE',
    # u'\N{LATIN SMALL LETTER TURNED AE}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL BARRED B}': u'B',
    u'\N{LATIN LETTER SMALL CAPITAL C}': u'C',
    u'\N{LATIN LETTER SMALL CAPITAL D}': u'D',
    u'\N{LATIN LETTER SMALL CAPITAL ETH}': u'D',
    u'\N{LATIN LETTER SMALL CAPITAL E}': u'E',
    # u'\N{LATIN SMALL LETTER TURNED OPEN E}': u'',
    # u'\N{LATIN SMALL LETTER TURNED I}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL J}': u'J',
    u'\N{LATIN LETTER SMALL CAPITAL K}': u'K',
    u'\N{LATIN LETTER SMALL CAPITAL L WITH STROKE}': u'L',
    u'\N{LATIN LETTER SMALL CAPITAL M}': u'M',
    # u'\N{LATIN LETTER SMALL CAPITAL REVERSED N}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL O}': u'O',
    u'\N{LATIN LETTER SMALL CAPITAL OPEN O}': u'O',
    # u'\N{LATIN SMALL LETTER SIDEWAYS O}': u'',
    # u'\N{LATIN SMALL LETTER SIDEWAYS OPEN O}': u'',
    # u'\N{LATIN SMALL LETTER SIDEWAYS O WITH STROKE}': u'',
    # u'\N{LATIN SMALL LETTER TURNED OE}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL OU}': u'OU',
    # u'\N{LATIN SMALL LETTER TOP HALF O}': u'',
    # u'\N{LATIN SMALL LETTER BOTTOM HALF O}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL P}': u'P',
    # u'\N{LATIN LETTER SMALL CAPITAL REVERSED R}': u'',
    # u'\N{LATIN LETTER SMALL CAPITAL TURNED R}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL T}': u'T',
    u'\N{LATIN LETTER SMALL CAPITAL U}': u'U',
    # u'\N{LATIN SMALL LETTER SIDEWAYS U}': u'',
    # u'\N{LATIN SMALL LETTER SIDEWAYS DIAERESIZED U}': u'',
    # u'\N{LATIN SMALL LETTER SIDEWAYS TURNED M}': u'',
    u'\N{LATIN LETTER SMALL CAPITAL V}': u'V',
    u'\N{LATIN LETTER SMALL CAPITAL W}': u'W',
    u'\N{LATIN LETTER SMALL CAPITAL Z}': u'Z',
    u'\N{LATIN LETTER SMALL CAPITAL EZH}': u'S',
    # u'\N{LATIN LETTER VOICED LARYNGEAL SPIRANT}': u'',
    # u'\N{LATIN LETTER AIN}': u'',
    u'\N{LATIN SMALL LETTER UE}': u'ue',
    u'\N{LATIN SMALL LETTER B WITH MIDDLE TILDE}': u'b',
    u'\N{LATIN SMALL LETTER D WITH MIDDLE TILDE}': u'd',
    u'\N{LATIN SMALL LETTER F WITH MIDDLE TILDE}': u'f',
    u'\N{LATIN SMALL LETTER M WITH MIDDLE TILDE}': u'm',
    u'\N{LATIN SMALL LETTER N WITH MIDDLE TILDE}': u'n',
    u'\N{LATIN SMALL LETTER P WITH MIDDLE TILDE}': u'p',
    u'\N{LATIN SMALL LETTER R WITH MIDDLE TILDE}': u'r',
    u'\N{LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE}': u'r',
    u'\N{LATIN SMALL LETTER S WITH MIDDLE TILDE}': u's',
    u'\N{LATIN SMALL LETTER T WITH MIDDLE TILDE}': u't',
    u'\N{LATIN SMALL LETTER Z WITH MIDDLE TILDE}': u'z',
    # u'\N{LATIN SMALL LETTER TURNED G}': u'',
    # u'\N{LATIN SMALL LETTER INSULAR G}': u'',
    u'\N{LATIN SMALL LETTER TH WITH STRIKETHROUGH}': u'th',
    u'\N{LATIN SMALL CAPITAL LETTER I WITH STROKE}': u'I',
    # u'\N{LATIN SMALL LETTER IOTA WITH STROKE}': u'',
    u'\N{LATIN SMALL LETTER P WITH STROKE}': u'p',
    u'\N{LATIN SMALL CAPITAL LETTER U WITH STROKE}': u'U',
    # u'\N{LATIN SMALL LETTER UPSILON WITH STROKE}': u'',
    u'\N{LATIN SMALL LETTER B WITH PALATAL HOOK}': u'b',
    u'\N{LATIN SMALL LETTER D WITH PALATAL HOOK}': u'd',
    u'\N{LATIN SMALL LETTER F WITH PALATAL HOOK}': u'f',
    u'\N{LATIN SMALL LETTER G WITH PALATAL HOOK}': u'g',
    u'\N{LATIN SMALL LETTER K WITH PALATAL HOOK}': u'k',
    u'\N{LATIN SMALL LETTER L WITH PALATAL HOOK}': u'l',
    u'\N{LATIN SMALL LETTER M WITH PALATAL HOOK}': u'm',
    u'\N{LATIN SMALL LETTER N WITH PALATAL HOOK}': u'n',
    u'\N{LATIN SMALL LETTER P WITH PALATAL HOOK}': u'p',
    u'\N{LATIN SMALL LETTER R WITH PALATAL HOOK}': u'r',
    u'\N{LATIN SMALL LETTER S WITH PALATAL HOOK}': u's',
    u'\N{LATIN SMALL LETTER ESH WITH PALATAL HOOK}': u'sh',
    u'\N{LATIN SMALL LETTER V WITH PALATAL HOOK}': u'v',
    u'\N{LATIN SMALL LETTER X WITH PALATAL HOOK}': u'x',
    u'\N{LATIN SMALL LETTER Z WITH PALATAL HOOK}': u'z',
    u'\N{LATIN SMALL LETTER A WITH RETROFLEX HOOK}': u'a',
    # u'\N{LATIN SMALL LETTER ALPHA WITH RETROFLEX HOOK}': u'',
    u'\N{LATIN SMALL LETTER D WITH HOOK AND TAIL}': u'd',
    u'\N{LATIN SMALL LETTER E WITH RETROFLEX HOOK}': u'e',
    u'\N{LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK}': u'e',
    u'\N{LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK}': u'e',
    # u'\N{LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK}': u'',
    u'\N{LATIN SMALL LETTER I WITH RETROFLEX HOOK}': u'i',
    u'\N{LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK}': u'o',
    u'\N{LATIN SMALL LETTER ESH WITH RETROFLEX HOOK}': u'sh',
    u'\N{LATIN SMALL LETTER U WITH RETROFLEX HOOK}': u'u',
    u'\N{LATIN SMALL LETTER EZH WITH RETROFLEX HOOK}': u's',
    # u'\N{LATIN SUBSCRIPT SMALL LETTER SCHWA}': u'',
    # u'\N{LATIN CROSS}': u''
    }

# Additional ones; see "man uni2ascii"
# Built up group by group: every character in a group shares one replacement.
# Spaces of any width become a plain space.
UNI2ASCII_CONVERSIONS = dict.fromkeys([
    u'\N{NO-BREAK SPACE}',
    u'\N{ETHIOPIC WORDSPACE}',
    u'\N{OGHAM SPACE MARK}',
    u'\N{EN QUAD}',
    u'\N{EM QUAD}',
    u'\N{EN SPACE}',
    u'\N{EM SPACE}',
    u'\N{THREE-PER-EM SPACE}',
    u'\N{FOUR-PER-EM SPACE}',
    u'\N{SIX-PER-EM SPACE}',
    u'\N{FIGURE SPACE}',
    u'\N{PUNCTUATION SPACE}',
    u'\N{THIN SPACE}',
    u'\N{HAIR SPACE}',
    u'\N{ZERO WIDTH SPACE}',
    u'\N{ZERO WIDTH NO-BREAK SPACE}',
    u'\N{IDEOGRAPHIC SPACE}',
    ], u' ')

# Hyphens, dashes and horizontal rules become a hyphen-minus.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
    u'\N{HYPHEN}',
    u'\N{NON-BREAKING HYPHEN}',
    u'\N{FIGURE DASH}',
    u'\N{EN DASH}',
    u'\N{EM DASH}',
    u'\N{MINUS SIGN}',
    u'\N{BOX DRAWINGS LIGHT HORIZONTAL}',
    u'\N{BOX DRAWINGS HEAVY HORIZONTAL}',
    ], u'-'))

# Vertical rules become a vertical bar.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
    u'\N{BOX DRAWINGS LIGHT VERTICAL}',
    u'\N{BOX DRAWINGS HEAVY VERTICAL}',
    ], u'|'))

# Every flavor of double quotation mark becomes the ascii double quote.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
    u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}',
    u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}',
    u'\N{LEFT DOUBLE QUOTATION MARK}',
    u'\N{RIGHT DOUBLE QUOTATION MARK}',
    u'\N{DOUBLE LOW-9 QUOTATION MARK}',
    u'\N{DOUBLE HIGH-REVERSED-9 QUOTATION MARK}',
    u'\N{HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT}',
    u'\N{HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT}',
    ], u'"'))

# Opening single quotation marks become a backquote ...
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
    u'\N{LEFT SINGLE QUOTATION MARK}',
    u'\N{SINGLE LOW-9 QUOTATION MARK}',
    u'\N{SINGLE HIGH-REVERSED-9 QUOTATION MARK}',
    u'\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}',
    ], u'`'))

# ... and closing ones become an apostrophe.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
    u'\N{RIGHT SINGLE QUOTATION MARK}',
    u'\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}',
    ], u"'"))

# Asterisk variants.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
    u'\N{LOW ASTERISK}',
    u'\N{ASTERISK OPERATOR}',
    u'\N{HEAVY ASTERISK}',
    u'\N{SMALL ASTERISK}',
    ], u'*'))

# Ellipses become three full stops.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
    u'\N{HORIZONTAL ELLIPSIS}',
    u'\N{MIDLINE HORIZONTAL ELLIPSIS}',
    ], u'...'))

# Everything else has a replacement of its own.
UNI2ASCII_CONVERSIONS.update({
    # Controversial: see http://www.cs.tut.fi/~jkorpela/shy.html
    u'\N{SOFT HYPHEN}': u'',
    u'\N{SMALL AMPERSAND}': u'&',
    u'\N{SMALL PLUS SIGN}': u'+',
    # Signs that are spelled out.
    u'\N{CENT SIGN}': u'cent',
    u'\N{POUND SIGN}': u'pound',
    u'\N{YEN SIGN}': u'yen',
    u'\N{COPYRIGHT SIGN}': u'(c)',
    u'\N{REGISTERED SIGN}': u'(R)',
    # Fractions.
    u'\N{VULGAR FRACTION ONE QUARTER}': u'1/4',
    u'\N{VULGAR FRACTION ONE HALF}': u'1/2',
    u'\N{VULGAR FRACTION THREE QUARTERS}': u'3/4',
    # Ligatures and digraphs are written out letter by letter.
    # u'\N{CAPITAL LETTER ASH}': u'AE',
    # u'\N{SMALL LETTER ASH}': u'ae',
    u'\N{LATIN SMALL LETTER SHARP S}': u'ss',
    u'\N{LATIN CAPITAL LIGATURE IJ}': u'IJ',
    u'\N{LATIN SMALL LIGATURE IJ}': u'ij',
    u'\N{LATIN CAPITAL LIGATURE OE}': u'OE',
    u'\N{LATIN SMALL LIGATURE OE}': u'oe',
    u'\N{LATIN CAPITAL LETTER DZ}': u'DZ',
    u'\N{LATIN CAPITAL LETTER DZ WITH CARON}': u'DZ',
    u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z}': u'Dz',
    u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON}': u'Dz',
    u'\N{LATIN SMALL LETTER DZ}': u'dz',
    u'\N{LATIN SMALL LETTER TS DIGRAPH}': u'ts',
    # Arrows.
    u'\N{LEFTWARDS ARROW}': u'<-',
    u'\N{RIGHTWARDS ARROW}': u'->',
    u'\N{LEFTWARDS DOUBLE ARROW}': u'<=',
    u'\N{RIGHTWARDS DOUBLE ARROW}': u'=>',
    })

# More from "man uni2ascii", in a different category.
EXTRA_CHARACTERS = dict((
    # Punctuation and free-standing accents.
    (u'\N{ACUTE ACCENT}', u"'"),
    (u'\N{BROKEN BAR}', u'|'),
    (u'\N{INVERTED EXCLAMATION MARK}', u'!'),
    (u'\N{INVERTED QUESTION MARK}', u'?'),
    (u'\N{MACRON}', u'_'),
    (u'\N{MIDDLE DOT}', u'*'),
    (u'\N{SOFT HYPHEN}', u''),
    # Signs that are spelled out as words.
    (u'\N{CENT SIGN}', u' cents '),
    (u'\N{CURRENCY SIGN}', u' currency '),
    (u'\N{DEGREE SIGN}', u' degrees '),
    (u'\N{POUND SIGN}', u'pound'),
    (u'\N{YEN SIGN}', u'yen'),
    (u'\N{MICRO SIGN}', u'micro'),
    (u'\N{NOT SIGN}', u'not'),
    (u'\N{PILCROW SIGN}', u'paragraph'),
    (u'\N{SECTION SIGN}', u'section'),
    (u'\N{COPYRIGHT SIGN}', u'(C)'),
    (u'\N{REGISTERED SIGN}', u'(R)'),
    # Arithmetic.
    (u'\N{DIVISION SIGN}', u'/'),
    (u'\N{MULTIPLICATION SIGN}', u'*'),
    (u'\N{PLUS-MINUS SIGN}', u'+/-'),
    (u'\N{SUPERSCRIPT ONE}', u'^1'),
    (u'\N{SUPERSCRIPT TWO}', u'^2'),
    (u'\N{SUPERSCRIPT THREE}', u'^3'),
    (u'\N{VULGAR FRACTION ONE QUARTER}', u'1/4'),
    (u'\N{VULGAR FRACTION ONE HALF}', u'1/2'),
    (u'\N{VULGAR FRACTION THREE QUARTERS}', u'3/4'),
    # Disabled entries, kept for reference.
    # u'\N{CEDILLA}': u'{cedilla}',
    # u'\N{DIAERESIS}': u'{umlaut}',
    # u'\N{FEMININE ORDINAL INDICATOR}': u'{^a}',
    # u'\N{MASCULINE ORDINAL INDICATOR}': u'{^o}',
    # wrong? u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'<<',
    # wrong? u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'>>',
    ))

def makeTranslator(tables=None):
    """Return the translation dictionary.

    The dictionary maps code points (integers) to unicode replacement
    strings, which is the form that the translate() method of a unicode
    string takes.  Runs under both Python 2 and Python 3.
      tables  optional sequence of dictionaries, each mapping a single
              character to its replacement string.  They are applied in
              order, so an entry in a later table overrides the same key in
              an earlier one.  Default is EXTRA_LATIN_NAMES, EXTRA_CHARACTERS,
              UNI2ASCII_CONVERSIONS.
    """
    if tables is None:
        tables = [EXTRA_LATIN_NAMES, EXTRA_CHARACTERS, UNI2ASCII_CONVERSIONS]
    try:
        int2char = unichr  # Python 2
    except NameError:
        int2char = chr  # Python 3: chr() already returns unicode
    d = {}
    # First do what can be done automatically: for every named LATIN
    # character in the Basic Multilingual Plane, keep whatever ascii is left
    # of its compatibility decomposition once the accents are dropped.
    for i in range(0x10000):
        u = int2char(i)
        # Unnamed code points get '' and so are skipped.
        if not unicodedata.name(u, '').startswith('LATIN '):
            continue
        k = unicodedata.normalize('NFKD', u).encode('ASCII', 'ignore')
        if k:
            d[i] = k.decode('ASCII')  # i=ord(u)
    # Next, add some by-hand ones (may be overlap so order matters)
    for m in tables:
        for c in m:
            replacement = m[c]
            try:
                if isinstance(replacement, bytes):
                    # A plain (byte) string literal slipped into a table
                    replacement = replacement.decode('ASCII')
                d[ord(c)] = replacement
            except (TypeError, ValueError):
                # Best effort: skip a key that is not exactly one character
                # or a byte-string replacement that is not ascii, rather
                # than lose the whole table.
                continue
    return d

# The table is built once, when the module is loaded, and shared by all calls.
translator = makeTranslator()


def unicode2ascii(s):
    """Give back a rough ascii look-alike of a unicode string.  Shh! Don't
    tell anyone. :-)
      s  unicode string
    """
    ascii_like = s.translate(translator)
    return ascii_like


# Demonstration: check the translation table, then print a sample text with
# its accents stripped.
if __name__ == "__main__":
    text = u"""

    "Jo, når'n da ha gått ett stôck te, så kommer'n te e å,
    å i åa ä e ö."
    "Vasa", sa'n.
    "Å i åa ä e ö", sa ja.
    "Men va i all ti ä dä ni säjer, a, o?", sa'n.
    "D'ä e å, vett ja", skrek ja, för ja ble rasen, "å i åa
    ä e ö, hörer han lite, d'ä e å, å i åa ä e ö."
    "A, o, ö", sa'n å dämmä geck'en.
    Jo, den va nôe te dum den.

    (taken from the short story "Dumt fôlk" in Gustaf Fröding's
    "Räggler å paschaser på våra mål tå en bonne" (1895).

    """
    # Every value in the table must be None or a string (byte or unicode);
    # report each one that is neither.
    text_types = (type(''), type(u''))
    for i in translator:
        replacement = translator[i]
        if not (replacement is None or isinstance(replacement, text_types)):
            print("The translation is wrong")

    # A single parenthesized argument prints the same under Python 2 and 3.
    print(unicode2ascii(text))
